In [ ]:
%%HTML
<style>
.container { width:100% }
</style>

Corpora Examples


In [ ]:
import nltk

In [ ]:
# nltk.download()              # opens the interactive downloader
# nltk.download('gutenberg')   # or fetch a single package non-interactively

In [ ]:
from nltk.corpus import gutenberg

Show the titles (file IDs) of the books that are part of the Gutenberg corpus.


In [ ]:
gutenberg.fileids()

Retrieve the raw text of the book "Paradise Lost" by John Milton.


In [ ]:
print(gutenberg.raw('milton-paradise.txt'))

Retrieve the book "Paradise Lost" by John Milton as a list of sentences.


In [ ]:
print(gutenberg.sents('milton-paradise.txt'))

Retrieve the book "Paradise Lost" by John Milton as a list of words.


In [ ]:
print(gutenberg.words('milton-paradise.txt'))
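
The corpus readers expose words and sentences as list-like views, so simple statistics such as the average sentence length can be computed directly. A minimal sketch (the exact counts depend on the installed corpus version):


In [ ]:
words = gutenberg.words('milton-paradise.txt')
sents = gutenberg.sents('milton-paradise.txt')
# Average sentence length in tokens (punctuation included)
print(len(words), len(sents), len(words) / len(sents))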

In [ ]:
from nltk.corpus import brown

In [ ]:
brown.ensure_loaded()   # force the lazy corpus loader, so help() documents the actual reader
help(brown)

In [ ]:
brown.fileids()

In [ ]:
brown.categories()

In [ ]:
brown.sents(categories='editorial')
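
Because the Brown corpus is partitioned by genre, word frequencies can be compared across categories. A minimal sketch using nltk.FreqDist on the 'editorial' category:


In [ ]:
from nltk import FreqDist

fd = FreqDist(w.lower() for w in brown.words(categories='editorial'))
fd.most_common(10)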

Tokenizing


In [ ]:
from nltk.tokenize import sent_tokenize

In [ ]:
generic_text = 'Lorem ipsum dolor sit amet, amet minim temporibus in sit. Vel ne impedit consequat intellegebat.'

The function sent_tokenize splits a string into a list of sentences.


In [ ]:
sent_tokenize(generic_text)

In [ ]:
english_text = 'Where is the closest train station? I need to reach London.'
sent_tokenize(english_text)

In [ ]:
spanish_text = '¿Dónde está la estación más cercana? Inmediatamente me tengo que ir a Barcelona.'
sent_tokenize(spanish_text, language='spanish')

In [ ]:
from nltk.tokenize import TreebankWordTokenizer

In [ ]:
simple_text = 'This is a simple text.'
tbwt        = TreebankWordTokenizer()
tbwt.tokenize(simple_text)

In [ ]:
help(TreebankWordTokenizer)
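
For most tasks the convenience function word_tokenize is enough: it splits the text into sentences and then applies a Treebank-style word tokenizer (it requires the punkt model to be downloaded):


In [ ]:
from nltk.tokenize import word_tokenize

word_tokenize("This isn't a simple text.")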

In [ ]:
from nltk.tokenize import RegexpTokenizer

In [ ]:
complex_text = "This isn't a simple text."

In [ ]:
ret = RegexpTokenizer(r"[a-zA-Z']+")

In [ ]:
ret.tokenize(complex_text)

In [ ]:
complex_text = "This isn't a simple text. Count 1, 2, 3 and then go!"

In [ ]:
ret.tokenize(complex_text)
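
Note that the letters-only pattern silently drops the digits 1, 2 and 3. If numbers should survive tokenization, the character class can be widened; a sketch with a hypothetical variant pattern:


In [ ]:
ret_digits = RegexpTokenizer(r"[a-zA-Z0-9']+")
ret_digits.tokenize(complex_text)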

Stopword Removal


In [ ]:
from nltk.corpus import stopwords

In [ ]:
sw = set(stopwords.words('english'))
sw

In [ ]:
len(sw)
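
Stopword lists ship for several languages; the corpus reader can list the available ones:


In [ ]:
stopwords.fileids()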

In [ ]:
complex_text = "This isn't a simple text. Count 1, 2, 3 and then go!"
tokens = ret.tokenize(complex_text)
clean_tokens = [t for t in tokens if t not in sw]
clean_tokens
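
One caveat: the stopword list is all lowercase, so capitalized tokens such as 'This' pass the filter above. Lowercasing before the membership test avoids this (a minimal sketch):


In [ ]:
clean_tokens = [t for t in tokens if t.lower() not in sw]
clean_tokens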

Language Detection


In [ ]:
from langdetect import detect, detect_langs

In [ ]:
detect('This is German')   # the sentence itself is written in English, whatever it claims

In [ ]:
detect_langs('I really love you mon doux amour!')

In [ ]:
detect('I really love you mon doux amour!')
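
langdetect is probabilistic, so repeated calls on short or mixed-language inputs can return different answers. Seeding the detector factory makes the results reproducible:


In [ ]:
from langdetect import DetectorFactory

DetectorFactory.seed = 0
detect('I really love you mon doux amour!')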

Stemming


In [ ]:
from nltk.stem.snowball import SnowballStemmer

In [ ]:
ess = SnowballStemmer('english', ignore_stopwords=True)

In [ ]:
ess.stem('flies')

In [ ]:
from nltk.stem.porter    import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [ ]:
ps = PorterStemmer()
ps.stem('teeth')

In [ ]:
ls = LancasterStemmer()
ls.stem('teeth')
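
Stemmers are rule-based suffix strippers and generally cannot map an irregular form like 'teeth' back to 'tooth'. A dictionary-based lemmatizer can; a minimal sketch with WordNetLemmatizer (it needs the wordnet data downloaded):


In [ ]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
wnl.lemmatize('teeth')   # looked up in WordNet rather than stripped by rule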

Vectorization


In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

In [ ]:
corpus = [ 'This is a simple test corpus',
           'A corpus is a set of text documents',
           'We want to analyze the corpus and the documents',
           'Documents can be automatically tokenized'
         ]

In [ ]:
cv = CountVectorizer()

In [ ]:
vectorized_corpus = cv.fit_transform(corpus)

In [ ]:
vectorized_corpus.todense()

In [ ]:
cv.vocabulary_
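
The fitted vectorizer can now encode unseen documents against the learned vocabulary; out-of-vocabulary words are simply ignored. A minimal sketch with a made-up sentence:


In [ ]:
new_doc = ['We want a simple test document']
v = cv.transform(new_doc)
print(v.todense())                # counts over the learned vocabulary
print(cv.inverse_transform(v))    # the vocabulary words actually found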
